Edited by Felicia Luo
-What factors affect MMF yields and AUM/flows, and how? (fund size, Fed rate, etc.)
-Which rating agencies (Moody's, Fitch, S&P) are important for predicting AUM/flows?
-Do AUM have seasonalities? How does the Fed rate influence portfolio AUM? Use historical portfolio AUM to forecast future AUM trends.
import os
import datetime
import warnings

import numpy as np
import pandas as pd
import boto3
import sagemaker
from sqlalchemy import create_engine

# Silence pandas/sklearn chained-assignment and deprecation chatter.
warnings.filterwarnings('ignore')

sess = sagemaker.Session()

# Connect to Redshift via SQLAlchemy.
# SECURITY FIX: the database password was previously hard-coded in this file.
# Read credentials from the environment instead (set EDM_DB_USER and
# EDM_DB_PASSWORD before running).
_db_user = os.environ['EDM_DB_USER']
_db_password = os.environ['EDM_DB_PASSWORD']
# create_engine('postgresql://username:password@host:port/database name')
engine = create_engine(
    f'postgresql://{_db_user}:{_db_password}'
    '@lm-edm-prod-product.czji6gyk9wtb.us-east-1.redshift.amazonaws.com:5439/edmprod'
)

#read sql
#data_mmf = pd.read_sql_query("""select * from pdc.mmkt_fund_intel_dly""",engine)
#read pickle file - mmkt fund intel dly
#data_mmf.to_pickle("./mmf.pkl")
edh_mm = pd.read_pickle("./mmf.pkl")

# Read EFFR (effective federal funds rate) history from S3.
bucket = "lm-edm-product-prod"
s3 = boto3.client('s3')
file_name = "032005to032020_EFFR.csv"
obj = s3.get_object(Bucket=bucket, Key=file_name)
fed_rate = pd.read_csv(obj['Body'])
# '.' marks missing observations in the FRED export; drop those rows.
fed_rate = fed_rate[fed_rate.EFFR != '.']
#read CNCF from S3 Bucket
file_name_1 ="032005to032020_CNCF.csv"
obj_1 = s3.get_object(Bucket=bucket, Key = file_name_1)
cncf = pd.read_csv(obj_1['Body'])
#read fund_imoneynet from S3 Bucket
file_name_2 = "Fund_Yield_-ad_hoc.xlsx"
obj_2 = s3.get_object(Bucket=bucket, Key = file_name_2)
# header=1: the first spreadsheet row is a title row, real headers are row 2.
df_data_1 = pd.read_excel(obj_2['Body'],header=1)
#change data type: parse date columns so later merges/sorts work on datetimes
fed_rate['DATE'] = pd.to_datetime(fed_rate['DATE'])
cncf['DATE'] = pd.to_datetime(cncf['DATE'])
# errors='coerce' turns unparseable dates into NaT instead of raising.
df_data_1['Date'] = pd.to_datetime(df_data_1['Date'], dayfirst=True, errors='coerce')
fed_rate['EFFR'] = fed_rate['EFFR'].astype('float')
#bucket ="lm-edm-product-prod"
#s3 = boto3.client('s3')
#file_name_2 = "Fund_Yield_-ad_hoc.xlsx"
#obj_2 = s3.get_object(Bucket=bucket, Key = file_name_2)
#fund_imoneynet = pd.read_excel(obj_2['Body'])
# Keep only the columns needed downstream from the daily fund-intel table.
edh_mm_list = ['assets','chg1d','chg7d','gr1d','gr30d','inflow','outflow','netflow','wam','wal','one_day_percent','seven_day_percent',
'thirty_day','div_factor','mnav','dla','exp_percent','ytdchg','ass12_31','seven_d12_31','cnav','cusip','money_fund','as_of_dt']
edh_mm = edh_mm[edh_mm_list]
# Drop any duplicated column names that survived the selection.
edh_mm = edh_mm.loc[:,~edh_mm.columns.duplicated()]
edh_mm['as_of_dt'] = pd.to_datetime(edh_mm['as_of_dt'])
# Working alias for the iMoneyNet table used through the rest of the script.
df_data = df_data_1
# Plotting setup. BUG FIX: removed duplicate `import seaborn as sns` and the
# re-import of numpy (already imported at the top of the file).
import matplotlib.pyplot as plt
import seaborn as sns
#import pyfolio as pf
import math
from pandas.plotting import scatter_matrix

plt.style.use("bmh")

# Quick previews of each input table.
print("Preview - mmkt_fund_intel_dly data")
print("===================================")
edh_mm.describe()
print("Preview - fed rate data")
print("=======================")
#print(fed_rate)
fed_rate.head()
# Normalize the '-' missing-value marker, forward-fill, then zero-fill the rest.
fed_rate=fed_rate.replace('-',np.nan)
fed_rate=fed_rate.fillna(method='ffill')
fed_rate=fed_rate.fillna(0)
print("Preview - CNCF")
print("=======================")
cncf.head()
print("Preview - iMoneyNet")
print("=======================")
df_data.head()
def check_flag(df, dt_column='EFFR'):
    """Label each row 'Low EFFR' (rate <= 0.25) or 'High EFFR'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric column named *dt_column*.
    dt_column : str, default 'EFFR'
        Column holding the rate used for the threshold test.

    Returns
    -------
    list of str, same length and order as ``df``.
    """
    # Vectorised comparison instead of the original iterrows() loop —
    # identical output, far faster on long rate histories.
    return np.where(df[dt_column] <= 0.25, 'Low EFFR', 'High EFFR').tolist()
# Tag every observation with its Low/High EFFR regime.
fed_rate['flag'] = check_flag(fed_rate)
df_data.drop(['Unnamed: 0'], axis=1, inplace=True)
df_data['Date'] = pd.to_datetime(df_data['Date'])
# Duplicate DATE under the name 'Date' so fed_rate can merge on either key.
fed_rate['Date'] = pd.to_datetime(fed_rate['DATE'])
#switch the string data to numbers
#df_data=df_data.iloc[:,10:29]
df_data.iloc[:, 10:29] = df_data.iloc[:, 10:29].replace('-', np.nan)
df_data.iloc[:, 10:29] = df_data.iloc[:, 10:29].fillna(method='ffill')
df_data.iloc[:, 10:29] = df_data.iloc[:, 10:29].fillna(0)
df_data.iloc[:, 30] = df_data.iloc[:, 30].replace('-', np.nan)
df_data.iloc[:, 30] = df_data.iloc[:, 30].fillna(method='ffill')
df_data.iloc[:, 30] = df_data.iloc[:, 30].fillna(0)
# Forward-fill and coerce the yield/fee columns to numeric.
# BUG FIX: the original called fillna() and pd.to_numeric() without assigning
# the results back, so both conversions were silently discarded (neither
# operates in place).
for col in ['Portfolio Assets (mo/$mils)', '1-DSY (dly)', '3-Mo (NSAY)',
            '1-Yr (NSAY)', '5-Yr (NSAY)', '10-Yr (NSAY)', 'Incd 12b-1 Fee (mo)',
            'Incd Expense Ratio (mo)', 'Incd Mgmt Fee (mo)',
            'Incd Other Fees (mo)', 'Incd Shr. Svc. Fee (mo)']:
    if col in df_data.columns:
        df_data[col] = df_data[col].fillna(method='ffill')
        df_data[col] = pd.to_numeric(df_data[col], errors='coerce')
#print(df_data.dtypes)
def ensure_float(x):
    """Return *x* if it is a float (incl. NumPy float scalars), else 0.

    Used to scrub residual non-numeric values (e.g. leftover strings) out of
    columns that should be numeric before arithmetic is applied.
    """
    # BUG FIX: `np.float` was deprecated in NumPy 1.20 and removed in 1.24.
    # It was an alias for the builtin float, so isinstance(x, float) is the
    # exact replacement (np.float64 subclasses float and still passes).
    if isinstance(x, float):
        return x
    return 0
#add a column to determine if is 12b-1 or non12b-1, positive netflow or negative netflow
# Clean the 12b-1 fee column: '-' marks missing -> treat as 0 (no fee).
df_data['Incd 12b-1 Fee (mo)']=df_data['Incd 12b-1 Fee (mo)'].replace('-',np.nan)
df_data['Incd 12b-1 Fee (mo)']=df_data['Incd 12b-1 Fee (mo)'].fillna(0)
df_data[['1-Yr (NCAGR)','Share Class Assets (mo/$mils)','Share Class Assets Chg (mo/$mils)']]=df_data[['1-Yr (NCAGR)','Share Class Assets (mo/$mils)','Share Class Assets Chg (mo/$mils)']].replace('-',np.nan)
df_data[['1-Yr (NCAGR)','Share Class Assets (mo/$mils)','Share Class Assets Chg (mo/$mils)']]=df_data[['1-Yr (NCAGR)','Share Class Assets (mo/$mils)','Share Class Assets Chg (mo/$mils)']].fillna(method='ffill')
# ensure_float maps any lingering non-float values (e.g. strings) to 0.
df_data['Share Class Assets Chg (mo/$mils)']=df_data['Share Class Assets Chg (mo/$mils)'].apply(ensure_float)
df_data['1-Yr (NCAGR)']=df_data['1-Yr (NCAGR)'].apply(ensure_float)
df_data['Share Class Assets (mo/$mils)']=df_data['Share Class Assets (mo/$mils)'].apply(ensure_float)
df_data['Incd 12b-1 Fee (mo)']=df_data['Incd 12b-1 Fee (mo)'].apply(ensure_float)
# Flag share classes that charge a 12b-1 fee.
df_data['12b_1']=np.where(df_data['Incd 12b-1 Fee (mo)']>0,'yes','no')
# add lagged share class asset for cagr
# NOTE(review): shift(12) assumes exactly one row per fund per month with no
# gaps — confirm against the source data.
df_data['sca_lagged'] = (df_data.sort_values(by=['Date'], ascending=True)
.groupby(['Fund Name'])['Share Class Assets (mo/$mils)'].shift(12))
df_data['fund_cagr'] = df_data['Share Class Assets (mo/$mils)']/df_data['sca_lagged']-1
# Rank funds by assets within (subcategory, date, 12b-1 status); top 10 = large.
df_data['assets_rank']=df_data.groupby(['SubCategory Current','Date','12b_1'])['Share Class Assets (mo/$mils)'].rank(ascending=False)
df_data['fund_size_large']=np.where(df_data['assets_rank']>10,'no','yes')
#Create a subset for rating analysis
# '-' in a rating column means the fund is unrated.
df_data[['Fitch Rating',"Moody's Rating",'S&P Rating']]=df_data[['Fitch Rating',"Moody's Rating",'S&P Rating']].replace('-','No Rating')
df_data_rating=df_data[['SubCategory Current','Fitch Rating',"Moody's Rating",'S&P Rating','Share Class Assets Chg (mo/$mils)','1-Yr (NCAGR)','fund_cagr']]
#Describe asset change by each of the 3 rating agencies, within subcategory.
describe_rating_FR_1=df_data_rating.groupby(['SubCategory Current','Fitch Rating'])['Share Class Assets Chg (mo/$mils)'].describe()
describe_rating_MR_1=df_data_rating.groupby(['SubCategory Current',"Moody's Rating"])['Share Class Assets Chg (mo/$mils)'].describe()
describe_rating_SR_1=df_data_rating.groupby(['SubCategory Current',"S&P Rating"])['Share Class Assets Chg (mo/$mils)'].describe()
#describe_rating_SC=df_data_rating.groupby("SubCategory Current")['Share Class Assets Chg (mo/$mils)'].describe()
#describe_rating_FR_1
# Same breakdown for growth rates (fund_cagr / 1-Yr NCAGR).
describe_rating_FR_cagr=df_data_rating.groupby(['SubCategory Current','Fitch Rating'])[['fund_cagr']].describe()
describe_rating_MR_cagr=df_data_rating.groupby(['SubCategory Current',"Moody's Rating"])['1-Yr (NCAGR)'].describe()
describe_rating_SR_cagr=df_data_rating.groupby(['SubCategory Current',"S&P Rating"])['1-Yr (NCAGR)'].describe()
#describe_rating_MR_cagr
# full_table is the cleaned iMoneyNet table used for the regime analysis below.
full_table = df_data
full_table=full_table.replace('-',np.nan)
full_table=full_table.fillna(method='ffill')
full_table=full_table.fillna(0)
# Correlation between the fed rate and fund growth.
# NOTE(review): at this point full_table is df_data (iMoneyNet); 'EFFR' is
# only merged in below (pd.merge with fed_rate) — confirm this line runs
# after the merge, or that the Excel export already carries an EFFR column.
cor=np.corrcoef(
full_table['EFFR'],
full_table['1-Yr (NCAGR)'])
cor
sns.set(style='whitegrid', context = 'notebook')
cols = ['assets','chg1d','chg7d','netflow','div_factor']
#ploting histogram to test data balance and feature distribution
#edh_eda.hist('asset_usd',bins=50)
#edh_eda.hist('sharpe_36m',bins=50)
#edh_eda.hist('std_dev_36m',bins=50)
#edh_eda['fund_type_d'].value_counts().plot(kind='bar')
#scatter_matrix(edh_mm[cols],alpha=0.2,figsize=(6,6),diagonal='kde')
# Attach the daily fed rate to every fund-date row (inner join on date).
full_table=pd.merge(fed_rate,full_table, left_on =['DATE'], right_on = ['Date'])
#print(edh_mm.isnull().any())
#edh_mm.describe()
#create a subset from edh_mm that only contains cusip and net flow
edh_mm_sub = edh_mm[['cusip','money_fund','as_of_dt','netflow']]
edh_mm_sub = edh_mm_sub.dropna(subset=['cusip','netflow'])
#edh_mm_sub.loc[edh_mm_sub['netflow']==0]
#merge edh_mm with iMoneynet on date + CUSIP
full_table_mm=pd.merge(edh_mm_sub,full_table, left_on =['as_of_dt','cusip'], right_on = ['DATE','CUSIP'])
full_table_mm=full_table_mm.replace('-',np.nan)
full_table_mm=full_table_mm.fillna(method='ffill')
full_table_mm=full_table_mm.fillna(0)
# Binary classification target: did the fund see a positive net flow?
full_table_mm['target']=np.where(full_table_mm['netflow']>0,'yes','no')
#full_table_mm
from collections import Counter
# Distribution of ratings and 1-yr NCAGR by rating, one figure per agency.
# NOTE(review): `fund_imoneynet_test` is not defined anywhere in this file
# (the read that would create it is commented out above) — confirm the
# intended DataFrame (likely df_data / full_table).
sp_rating = Counter(fund_imoneynet_test['S&P Rating'])
labels = sp_rating.keys()
sizes = sp_rating.values()
f, axes = plt.subplots(1,2,figsize=(14,4))
sns.countplot(fund_imoneynet_test['S&P Rating'],ax= axes[0], palette = "Set1")
axes[0].set_xlabel('S&P Rating', fontsize = 14)
axes[0].set_ylabel('Count', fontsize = 14)
axes[0].yaxis.tick_left()
sns.violinplot(x = 'S&P Rating', y= '1-Yr (NCAGR)', data = fund_imoneynet_test, ax = axes[1])
axes[1].set_xlabel('S&P Rating', fontsize = 14)
axes[1].set_ylabel('1-yr NCAGR', fontsize = 14)
axes[1].yaxis.tick_right()
plt.show()  # BUG FIX: was `plt.show` (attribute access only, never called)

fitch_rating = Counter(fund_imoneynet_test['Fitch Rating'])
labels = fitch_rating.keys()
sizes = fitch_rating.values()
f, axes = plt.subplots(1,2,figsize=(14,4))
sns.countplot(fund_imoneynet_test['Fitch Rating'],ax= axes[0], palette = "Set1")
axes[0].set_xlabel('Fitch Rating', fontsize = 14)
axes[0].set_ylabel('Count', fontsize = 14)
axes[0].yaxis.tick_left()
sns.violinplot(x = 'Fitch Rating', y= '1-Yr (NCAGR)', data = fund_imoneynet_test, ax = axes[1])
axes[1].set_xlabel('Fitch Rating', fontsize = 14)
axes[1].set_ylabel('1-yr NCAGR', fontsize = 14)
axes[1].yaxis.tick_right()
plt.show()  # BUG FIX: was `plt.show`

# NOTE(review): column is 'Moodys Rating' here but "Moody's Rating" elsewhere
# in this file — confirm which spelling the source table actually uses.
moody_rating = Counter(fund_imoneynet_test['Moodys Rating'])
labels = moody_rating.keys()   # BUG FIX: previously reused fitch_rating
sizes = moody_rating.values()  # BUG FIX: previously reused fitch_rating
f, axes = plt.subplots(1,2,figsize=(14,4))
sns.countplot(fund_imoneynet_test['Moodys Rating'],ax= axes[0], palette = "Set1")
axes[0].set_xlabel('Moodys Rating', fontsize = 14)
axes[0].set_ylabel('Count', fontsize = 14)
axes[0].yaxis.tick_left()
sns.violinplot(x = 'Moodys Rating', y= '1-Yr (NCAGR)', data = fund_imoneynet_test, ax = axes[1])
axes[1].set_xlabel('Moodys Rating', fontsize = 14)
axes[1].set_ylabel('1-yr NCAGR', fontsize = 14)
axes[1].yaxis.tick_right()
plt.show()  # BUG FIX: was `plt.show`
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.dates as mdates
import gc
import pylab
# Convenience alias for the 1-year net compound annual growth rate.
full_table['ncagr']= full_table['1-Yr (NCAGR)']
# First Tier funds: 30-obs rolling mean of NCAGR (inst vs retail) vs EFFR.
first_tier_inst=full_table.loc[full_table['SubCategory Current']=='First Tier Instit']
first_tier_re=full_table.loc[full_table['SubCategory Current']=='First Tier Retail']
rolling_mean = first_tier_inst.ncagr.rolling(window=30).mean()
rolling_mean_re = first_tier_re.ncagr.rolling(window=30).mean()
f, ax = plt.subplots(figsize=(20,10))
ax2 = plt.twinx()
full_table.plot(
    kind='line',
    x='DATE',
    y='EFFR',
    ax=ax,
    label='EFFR',   # BUG FIX: legend label was misspelled 'ERRF'
    color='maroon'
)
ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
plt.xlabel("Time")
plt.ylabel("1-year NCAGR and Fed Rate")
f.autofmt_xdate()
plt.plot(first_tier_inst.DATE,
         rolling_mean,
         label='first_tier_inst_cagr',
         color='green'
)
plt.plot(first_tier_re.DATE,
         rolling_mean_re,
         label='first_tier_retail_cagr',
         color='blue'
)
plt.legend(loc='upper left')
plt.show()
plt.close()
# Govt & Agency funds: 30-obs rolling mean of NCAGR (inst vs retail) vs EFFR.
govt_agen_inst=full_table.loc[full_table['SubCategory Current']=='Govt & Agencies Instit']
govt_agen_re=full_table.loc[full_table['SubCategory Current']=='Govt & Agency Retail']
rolling_mean = govt_agen_inst.ncagr.rolling(window=30).mean()
rolling_mean_re = govt_agen_re.ncagr.rolling(window=30).mean()
f, ax = plt.subplots(figsize=(20,10))
ax2 = plt.twinx()
full_table.plot(
    kind='line',
    x='DATE',
    y='EFFR',
    ax=ax,
    label='EFFR',   # BUG FIX: legend label was misspelled 'ERRF'
    color='maroon'
)
ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
plt.xlabel("Time")
plt.ylabel("1-year NCAGR and Fed Rate")
f.autofmt_xdate()
plt.plot(govt_agen_inst.DATE,
         rolling_mean,
         label='govt_agen_inst_cagr',
         color='green'
)
plt.plot(govt_agen_re.DATE,
         rolling_mean_re,
         label='govt_agen_retail_cagr',
         color='blue'
)
plt.legend(loc='upper left')
plt.show()
# Tax-Free National funds: 30-obs rolling mean of NCAGR vs EFFR.
tf_national_inst=full_table.loc[full_table['SubCategory Current']=='T-F National Inst']
tf_national_re=full_table.loc[full_table['SubCategory Current']=='T-F National Retail']
rolling_mean = tf_national_inst.ncagr.rolling(window=30).mean()
rolling_mean_re = tf_national_re.ncagr.rolling(window=30).mean()
f, ax = plt.subplots(figsize=(20,10))
ax2 = plt.twinx()
full_table.plot(
    kind='line',
    x='DATE',
    y='EFFR',
    ax=ax,
    label='EFFR',   # BUG FIX: legend label was misspelled 'ERRF'
    color='maroon'
)
ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
plt.xlabel("Time")
plt.ylabel("1-year NCAGR and Fed Rate")
f.autofmt_xdate()
plt.plot(tf_national_inst.DATE,
         rolling_mean,
         label='tf_national_inst_cagr',
         color='green'
)
plt.plot(tf_national_re.DATE,
         rolling_mean_re,
         label='tf_national_retail_cagr',
         color='blue'
)
plt.legend(loc='upper left')
plt.show()
# Treasury funds: 30-obs rolling mean of NCAGR (inst vs retail) vs EFFR.
treas_inst=full_table.loc[full_table['SubCategory Current']=='Treasury Instit']
treas_re=full_table.loc[full_table['SubCategory Current']=='Treasury Retail']
rolling_mean = treas_inst.ncagr.rolling(window=30).mean()
rolling_mean_re = treas_re.ncagr.rolling(window=30).mean()
f, ax = plt.subplots(figsize=(20,10))
ax2 = plt.twinx()
full_table.plot(
    kind='line',
    x='DATE',
    y='EFFR',
    ax=ax,
    label='EFFR',   # BUG FIX: legend label was misspelled 'ERRF'
    color='maroon'
)
ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
plt.xlabel("Time")
plt.ylabel("1-year NCAGR and Fed Rate")
f.autofmt_xdate()
plt.plot(treas_inst.DATE,
         rolling_mean,
         label='treas_inst_cagr',
         color='green'
)
plt.plot(treas_re.DATE,
         rolling_mean_re,
         label='treas_retail_cagr',
         color='blue'
)
plt.legend(loc='upper left')
plt.show()
# Tax-Free National funds again, with a shorter 15-obs rolling window and
# the EFFR series restricted to the institutional subset's rows.
full_table['ncagr']= full_table['1-Yr (NCAGR)']
tf_national_inst=full_table.loc[full_table['SubCategory Current']=='T-F National Inst']
tf_national_re=full_table.loc[full_table['SubCategory Current']=='T-F National Retail']
rolling_mean = tf_national_inst.ncagr.rolling(window=15).mean()
rolling_mean_re = tf_national_re.ncagr.rolling(window=15).mean()
f, ax = plt.subplots(figsize=(20,10))
ax2 = plt.twinx()
tf_national_inst.plot(
    kind='line',
    x='DATE',
    y='EFFR',
    ax=ax,
    label='EFFR',   # BUG FIX: legend label was misspelled 'ERRF'
    color='maroon'
)
ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
plt.xlabel("Time")
plt.ylabel("1-year NCAGR and Fed Rate")
f.autofmt_xdate()
plt.plot(tf_national_inst.DATE,
         rolling_mean,
         label='tf_national_inst_cagr',
         color='green'
)
plt.plot(tf_national_re.DATE,
         rolling_mean_re,
         label='tf_national_retail_cagr',
         color='blue'
)
plt.legend(loc='upper left')
plt.show()
# Split the merged table by rate regime for later comparisons.
low_effr = full_table.loc[full_table['flag'] == 'Low EFFR',:]
high_effr = full_table.loc[full_table['flag'] == 'High EFFR',:]
from scipy.stats import ttest_ind
from scipy import stats
from subprocess import check_output
from sklearn.preprocessing import StandardScaler
import math
ft_mm_table = full_table_mm
#ft_mm_table = ft_mm_table.set_index('SubCategory Current',inplace = True)
# NOTE(review): the label-based .loc below only works if 'SubCategory Current'
# is the index; the set_index call above is commented out (and with
# inplace=True it would return None anyway) — confirm this runs as intended.
#
ft_mm_table=full_table_mm.loc[['First Tier Retail','First Tier Instit']]
ft_mm_table=ft_mm_table[['1-Yr (NCAGR)','30-DSY (wk/mo)', 'Portfolio Assets (mo/$mils)','netflow', '1-DSY (dly)', 'Incd Expense Ratio (mo)', 'Incd Mgmt Fee (mo)', 'Incd Other Fees (mo)', 'Incd Shr. Svc. Fee (mo)','fund_size_large','year','month','Fitch Rating', "Moody's Rating", 'S&P Rating',
'flag','target']]
# NOTE(review): `pps` (the ppscore package) is never imported in this file.
mm_matrix=pps.matrix(ft_mm_table)
def heatmap(df):
    """Draw *df* as an annotated blue heat map (values 0-1); return the Axes."""
    axes = sns.heatmap(df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)
    axes.set_title('First Tier-PPS matrix')
    axes.set_xlabel('feature')
    axes.set_ylabel('target')
    return axes
# Render the First Tier PPS matrix on a large canvas.
sns.set(font_scale=1.4)
plt.subplots(figsize=(25,20))
heatmap(mm_matrix)
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score, RepeatedKFold, train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
pd.options.display.max_columns = None
from rfpimp import permutation_importances
# Model netflow for the First Tier categories with a random forest.
ft_mm_table_sub = ft_mm_table.drop(['target'],axis =1)
#create regression dataset: netflow is the response, everything else features
x = ft_mm_table_sub.drop(["netflow"],axis =1)
y = ft_mm_table_sub['netflow']
categorical = [col for col in x.columns if x[col].dtypes == 'O']
numerical = [col for col in x.columns if x[col].dtypes != 'O']
# encode categorical variables with one-hot encoding
# NOTE(review): `ce` (category_encoders) is never imported in this file.
encoder = ce.OneHotEncoder(cols=['flag','Fitch Rating', "Moody's Rating", 'S&P Rating', 'fund_size_large'])
x = encoder.fit_transform(x)
#split dataset
np.random.seed(seed=42)
# Pure-noise column: a baseline for the permutation-importance ranking below.
x['random']=np.random.random(size=len(x))
# NOTE(review): test_size=0.8 trains on only 20% of the data — confirm intended.
x_train,x_valid,y_train,y_valid = train_test_split(x,y,test_size = 0.8, random_state =101)
rf = RandomForestRegressor(n_estimators = 1500,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 80,
                           max_features = 'auto',
                           max_leaf_nodes = 50)
rf.fit(x_train,y_train)
# BUG FIX: printed label typo '00B' (zero-zero-B) -> 'OOB' (out-of-bag).
print('R^2 training score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rf.score(x_train,y_train),
      rf.oob_score_,
      rf.score(x_valid,y_valid)))
The training score seems acceptable, but the validation score indicates overfitting (low bias but high variance). Try scaling/normalizing the features to prevent the model from capturing noise in the data.
#normalize features
from sklearn.preprocessing import RobustScaler,MinMaxScaler
scaler = StandardScaler()
scaled_x = scaler.fit_transform(x)
cols = x.columns
# NOTE(review): columns=[cols] wraps the Index in a list, producing a
# MultiIndex of one level — columns=cols is probably what was intended.
scaled_x = pd.DataFrame(scaled_x, columns=[cols])
#scaled_x
# Cross-validate both models on the scaled features.
# NOTE(review): score_lr is computed but never reported below.
score_lr = cross_val_score(LinearRegression(),scaled_x,y,cv=RepeatedKFold(n_repeats = 10))
score_rf = cross_val_score(RandomForestRegressor(n_estimators = 100),scaled_x,y,cv = RepeatedKFold(n_repeats = 5))
print("\nRandom Forest Mean Validation Score:",np.mean(score_rf))
The mean validation score improves to 0.66 with scaled features.
def r2(rf, x_train, y_train):
    """R^2 of *rf*'s predictions against the training targets."""
    predicted = rf.predict(x_train)
    return r2_score(y_train, predicted)
# Permutation importance measured against training-set R^2.
perm_importance_rfpimp = permutation_importances(rf,x_train,y_train,r2)
perm_importance_rfpimp.reset_index(drop = False, inplace = True)
perm_importance_rfpimp
# Trim the top/bottom 1% of netflow to reduce outlier influence on the t-test.
q_hi = ft_mm_table["netflow"].quantile(0.99)
q_low = ft_mm_table["netflow"].quantile(0.01)
ft_mm_table=ft_mm_table[(ft_mm_table["netflow"]<q_hi) & (ft_mm_table["netflow"]>q_low) ]
H0: there is no difference between large funds and small funds in terms of netflow.
H1: netflow differs between large and small funds.
We want to know whether the average of each group is significantly different from that of the other group.
The main idea of the t-test is to check whether the observed difference is stronger than the variation in the data.
#t-test on Large fund vs. Small fund
large_fund = ft_mm_table.loc[ft_mm_table['fund_size_large'] == 'yes',:] #define large fund df
small_fund = ft_mm_table.loc[ft_mm_table['fund_size_large'] == 'no',:] #define small fund df
# Welch's t-test (equal_var=False: unequal group variances assumed).
ttest_ind(large_fund['netflow'],small_fund['netflow'],equal_var = False)
There is a significant difference between large-fund net flow and small-fund net flow, since the p-value is < 0.05.
ax = plt.subplots(figsize = (18,8))
#plot the small fund netflow distribution
sns.distplot(small_fund['netflow'],bins=20, hist = True, label = 'small fund')
#plot the large fund netflow distribution
sns.distplot(large_fund['netflow'],bins=20, hist = True, label = 'large fund')
plt.legend() #show legend
#boxplot - visually check the equal-spread assumption between groups
ax = sns.boxplot(y="fund_size_large",x="netflow",data=ft_mm_table,orient="h")
# Repeat the large-vs-small comparison for First Tier Retail only.
first_tier_re=full_table_mm.loc[full_table_mm['SubCategory Current']=='First Tier Retail']
# Trim the top/bottom 1% of netflow to reduce outlier influence.
q_hi = first_tier_re["netflow"].quantile(0.99)
q_low = first_tier_re["netflow"].quantile(0.01)
first_tier_re=first_tier_re[(first_tier_re["netflow"]<q_hi) & (first_tier_re["netflow"]>q_low)]
#t-test on Large fund vs. Small fund
large_fund = first_tier_re.loc[first_tier_re['fund_size_large'] == 'yes',:] #define large fund df
small_fund = first_tier_re.loc[first_tier_re['fund_size_large'] == 'no',:] #define small fund df
# Welch's t-test (unequal variances assumed).
ttest_ind(large_fund['netflow'],small_fund['netflow'],equal_var = False)
ax = plt.subplots(figsize = (18,8))
#plot the small fund netflow distribution
sns.distplot(small_fund['netflow'],bins=20, hist = True, label = 'small fund')
#plot the large fund netflow distribution
sns.distplot(large_fund['netflow'],bins=20, hist = True, label = 'large fund')
plt.legend() #show legend
#boxplot - visually check the equal-spread assumption between groups
ax = sns.boxplot(y="fund_size_large",x="netflow",data=first_tier_re,orient="h")
# Govt & Agency categories: same PPS-matrix feature subset as First Tier.
ft_mm_table=full_table_mm.loc[['Govt & Agencies Instit','Govt & Agency Retail']]
ft_mm_table=ft_mm_table[['1-Yr (NCAGR)','30-DSY (wk/mo)', 'Portfolio Assets (mo/$mils)','netflow', '1-DSY (dly)', 'Incd Expense Ratio (mo)', 'Incd Mgmt Fee (mo)', 'Incd Other Fees (mo)', 'Incd Shr. Svc. Fee (mo)','fund_size_large','year','month','Fitch Rating', "Moody's Rating", 'S&P Rating', 'flag','target']]
# NOTE(review): `pps` (the ppscore package) is never imported in this file.
mm_matrix=pps.matrix(ft_mm_table)
def heatmap(df):
    """Draw *df* as an annotated blue heat map (values 0-1); return the Axes."""
    axes = sns.heatmap(df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)
    axes.set_title('Govt & Agency-PPS matrix')
    axes.set_xlabel('feature')
    axes.set_ylabel('target')
    return axes
# Render the Govt & Agency PPS matrix, then fit a random forest on netflow.
sns.set(font_scale=1.4)
plt.subplots(figsize=(25,20))
heatmap(mm_matrix)
ft_mm_table_sub = ft_mm_table.drop(['target'],axis =1)
#create regression dataset: netflow is the response, everything else features
x = ft_mm_table_sub.drop(["netflow"],axis =1)
y = ft_mm_table_sub['netflow']
categorical = [col for col in x.columns if x[col].dtypes == 'O']
numerical = [col for col in x.columns if x[col].dtypes != 'O']
# encode categorical variables with one-hot encoding
# NOTE(review): `ce` (category_encoders) is never imported in this file.
encoder = ce.OneHotEncoder(cols=['Fitch Rating', "Moody's Rating", 'S&P Rating', 'flag','fund_size_large'])
x = encoder.fit_transform(x)
#split dataset
np.random.seed(seed=42)
# Pure-noise column: a baseline for the permutation-importance ranking below.
x['random']=np.random.random(size=len(x))
x_train,x_valid,y_train,y_valid = train_test_split(x,y,test_size = 0.8, random_state =42)
rf = RandomForestRegressor(n_estimators = 1000,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)
rf.fit(x_train,y_train)
# BUG FIX: printed label typo '00B' (zero-zero-B) -> 'OOB' (out-of-bag).
print('R^2 training score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rf.score(x_train,y_train),
      rf.oob_score_,
      rf.score(x_valid,y_valid)))
#normalize features
from sklearn.preprocessing import RobustScaler,MinMaxScaler
scaler = StandardScaler()
scaled_x = scaler.fit_transform(x)
cols = x.columns
scaled_x = pd.DataFrame(scaled_x, columns=[cols])
#scaled_x
# Cross-validate both models on the scaled features.
# NOTE(review): score_lr is computed but never reported below.
score_lr = cross_val_score(LinearRegression(),scaled_x,y,cv=RepeatedKFold(n_repeats = 10))
score_rf = cross_val_score(RandomForestRegressor(n_estimators = 100),scaled_x,y,cv = RepeatedKFold(n_repeats = 5))
print("\nRandom Forest Mean Validation Score:",np.mean(score_rf))
# Training-set R^2, used as the metric for permutation importance.
def r2(rf,x_train,y_train):
return r2_score(y_train,rf.predict(x_train))
perm_importance_rfpimp = permutation_importances(rf,x_train,y_train,r2)
perm_importance_rfpimp.reset_index(drop = False, inplace = True)
perm_importance_rfpimp
# Govt & Agencies Instit only, with the top/bottom 1% of netflow trimmed.
subset=full_table_mm.loc[full_table_mm['SubCategory Current']=='Govt & Agencies Instit']
q_hi = subset["netflow"].quantile(0.99)
q_low = subset["netflow"].quantile(0.01)
subset=subset[(subset["netflow"]<q_hi) & (subset["netflow"]>q_low)]
H0: there is no difference between large funds and small funds in terms of netflow.
H1: netflow differs between large and small funds.
We want to know whether the average of each group is significantly different from that of the other group.
The main idea of the t-test is to check whether the observed difference is stronger than the variation in the data.
#t-test on Large fund vs. Small fund
large_fund = subset.loc[subset['fund_size_large'] == 'yes',:] #define large fund df
small_fund = subset.loc[subset['fund_size_large'] == 'no',:] #define small fund df
# Welch's t-test (unequal variances assumed).
ttest_ind(large_fund['netflow'],small_fund['netflow'],equal_var = False)
ax = plt.subplots(figsize = (18,8))
#plot the small fund netflow distribution
sns.distplot(small_fund['netflow'],bins=20, hist = True, label = 'small fund')
#plot the large fund netflow distribution
sns.distplot(large_fund['netflow'],bins=20, hist = True, label = 'large fund')
plt.legend() #show legend
#boxplot - test assumption
# BUG FIX: the boxplot previously plotted `first_tier_re` (copy-paste from the
# First Tier section); this section analyses `subset` (Govt & Agencies Instit).
ax = sns.boxplot(y="fund_size_large",x="netflow",data=subset,orient="h")
# Govt & Agency Retail only, with the top/bottom 1% of netflow trimmed.
subset=full_table_mm.loc[full_table_mm['SubCategory Current']=='Govt & Agency Retail']
q_hi = subset["netflow"].quantile(0.99)
q_low = subset["netflow"].quantile(0.01)
subset=subset[(subset["netflow"]<q_hi) & (subset["netflow"]>q_low)]
H0: there is no difference between large funds and small funds in terms of netflow.
H1: netflow differs between large and small funds.
We want to know whether the average of each group is significantly different from that of the other group.
The main idea of the t-test is to check whether the observed difference is stronger than the variation in the data.
#t-test on Large fund vs. Small fund
large_fund = subset.loc[subset['fund_size_large'] == 'yes',:] #define large fund df
small_fund = subset.loc[subset['fund_size_large'] == 'no',:] #define small fund df
# Welch's t-test (unequal variances assumed).
ttest_ind(large_fund['netflow'],small_fund['netflow'],equal_var = False)
ax = plt.subplots(figsize = (18,8))
#plot the small fund netflow distribution
sns.distplot(small_fund['netflow'],bins=20, hist = True, label = 'small fund')
#plot the large fund netflow distribution
sns.distplot(large_fund['netflow'],bins=20, hist = True, label = 'large fund')
plt.legend() #show legend
#boxplot - test assumption
# BUG FIX: the boxplot previously plotted `first_tier_re` (copy-paste from the
# First Tier section); this section analyses `subset` (Govt & Agency Retail).
ax = sns.boxplot(y="fund_size_large",x="netflow",data=subset,orient="h")
# Treasury & Repo categories: same PPS-matrix feature subset as above.
ft_mm_table=full_table_mm.loc[['Treas & Repo Retail','Treasury & Repo Instit']]
ft_mm_table=ft_mm_table[['1-Yr (NCAGR)','30-DSY (wk/mo)', 'Portfolio Assets (mo/$mils)','netflow', '1-DSY (dly)', 'Incd Expense Ratio (mo)', 'Incd Mgmt Fee (mo)', 'Incd Other Fees (mo)', 'Incd Shr. Svc. Fee (mo)','fund_size_large','year','month','Fitch Rating', "Moody's Rating", 'S&P Rating', '12b_1','target']]
# NOTE(review): `pps` (the ppscore package) is never imported in this file.
mm_matrix=pps.matrix(ft_mm_table)
def heatmap(df):
    """Draw *df* as an annotated blue heat map (values 0-1); return the Axes."""
    axes = sns.heatmap(df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)
    axes.set_title('Treasury & Repo -PPS matrix')
    axes.set_xlabel('feature')
    axes.set_ylabel('target')
    return axes
# Render the Treasury & Repo PPS matrix, then fit a random forest on netflow.
sns.set(font_scale=1.4)
plt.subplots(figsize=(25,20))
heatmap(mm_matrix)
ft_mm_table_sub = ft_mm_table.drop(['target'],axis =1)
#create regression dataset: netflow is the response, everything else features
x = ft_mm_table_sub.drop(["netflow"],axis =1)
y = ft_mm_table_sub['netflow']
categorical = [col for col in x.columns if x[col].dtypes == 'O']
numerical = [col for col in x.columns if x[col].dtypes != 'O']
# encode categorical variables with one-hot encoding
# NOTE(review): `ce` (category_encoders) is never imported in this file.
encoder = ce.OneHotEncoder(cols=['Fitch Rating', "Moody's Rating", 'S&P Rating', '12b_1','fund_size_large'])
x = encoder.fit_transform(x)
#split dataset
np.random.seed(seed=42)
# Pure-noise column: a baseline for the permutation-importance ranking below.
x['random']=np.random.random(size=len(x))
x_train,x_valid,y_train,y_valid = train_test_split(x,y,test_size = 0.8, random_state =42)
rf = RandomForestRegressor(n_estimators = 1000,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)
rf.fit(x_train,y_train)
# BUG FIX: printed label typo '00B' (zero-zero-B) -> 'OOB' (out-of-bag).
print('R^2 training score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rf.score(x_train,y_train),
      rf.oob_score_,
      rf.score(x_valid,y_valid)))
#normalize features
from sklearn.preprocessing import RobustScaler,MinMaxScaler
scaler = StandardScaler()
scaled_x = scaler.fit_transform(x)
cols = x.columns
scaled_x = pd.DataFrame(scaled_x, columns=[cols])
#scaled_x
# Cross-validate both models on the scaled features.
# NOTE(review): score_lr is computed but never reported below.
score_lr = cross_val_score(LinearRegression(),scaled_x,y,cv=RepeatedKFold(n_repeats = 10))
score_rf = cross_val_score(RandomForestRegressor(n_estimators = 100),scaled_x,y,cv = RepeatedKFold(n_repeats = 5))
print("\nRandom Forest Mean Validation Score:",np.mean(score_rf))
# Training-set R^2, used as the metric for permutation importance.
def r2(rf,x_train,y_train):
return r2_score(y_train,rf.predict(x_train))
perm_importance_rfpimp = permutation_importances(rf,x_train,y_train,r2)
perm_importance_rfpimp.reset_index(drop = False, inplace = True)
perm_importance_rfpimp
# Treasury & Repo Instit only, with the top/bottom 1% of netflow trimmed.
subset=full_table_mm.loc[full_table_mm['SubCategory Current']=='Treasury & Repo Instit']
q_hi = subset["netflow"].quantile(0.99)
q_low = subset["netflow"].quantile(0.01)
subset=subset[(subset["netflow"]<q_hi) & (subset["netflow"]>q_low)]
H0: there is no difference between large funds and small funds in terms of netflow.
H1: netflow differs between large and small funds.
We want to know whether the average of each group is significantly different from that of the other group.
The main idea of the t-test is to check whether the observed difference is stronger than the variation in the data.
#t-test on Large fund vs. Small fund
large_fund = subset.loc[subset['fund_size_large'] == 'yes',:] #define large fund df
small_fund = subset.loc[subset['fund_size_large'] == 'no',:] #define small fund df
# Welch's t-test (unequal variances assumed).
ttest_ind(large_fund['netflow'],small_fund['netflow'],equal_var = False)
ax = plt.subplots(figsize = (18,8))
#plot the small fund netflow distribution
sns.distplot(small_fund['netflow'],bins=20, hist = True, label = 'small fund')
#plot the large fund netflow distribution
sns.distplot(large_fund['netflow'],bins=20, hist = True, label = 'large fund')
plt.legend() #show legend
#boxplot - test assumption
# BUG FIX: the boxplot previously plotted `first_tier_re` (copy-paste from the
# First Tier section); this section analyses `subset` (Treasury & Repo Instit).
ax = sns.boxplot(y="fund_size_large",x="netflow",data=subset,orient="h")
# Treas & Repo Retail only, with the top/bottom 1% of netflow trimmed.
subset=full_table_mm.loc[full_table_mm['SubCategory Current']=='Treas & Repo Retail']
q_hi = subset["netflow"].quantile(0.99)
q_low = subset["netflow"].quantile(0.01)
subset=subset[(subset["netflow"]<q_hi) & (subset["netflow"]>q_low)]
H0: there is no difference between large funds and small funds in terms of netflow.
H1: netflow differs between large and small funds.
We want to know whether the average of each group is significantly different from that of the other group.
The main idea of the t-test is to check whether the observed difference is stronger than the variation in the data.
#t-test on Large fund vs. Small fund
large_fund = subset.loc[subset['fund_size_large'] == 'yes',:] #define large fund df
small_fund = subset.loc[subset['fund_size_large'] == 'no',:] #define small fund df
# Welch's t-test (unequal variances assumed).
ttest_ind(large_fund['netflow'],small_fund['netflow'],equal_var = False)
ax = plt.subplots(figsize = (18,8))
#plot the small fund netflow distribution
sns.distplot(small_fund['netflow'],bins=20, hist = True, label = 'small fund')
#plot the large fund netflow distribution
sns.distplot(large_fund['netflow'],bins=20, hist = True, label = 'large fund')
plt.legend() #show legend
#boxplot - test assumption
# BUG FIX: the boxplot previously plotted `first_tier_re` (copy-paste from the
# First Tier section); this section analyses `subset` (Treas & Repo Retail).
ax = sns.boxplot(y="fund_size_large",x="netflow",data=subset,orient="h")
# T-F National categories: same PPS-matrix feature subset as above.
ft_mm_table=full_table_mm.loc[['T-F National Inst','T-F National Retail']]
ft_mm_table=ft_mm_table[['1-Yr (NCAGR)','30-DSY (wk/mo)', 'Portfolio Assets (mo/$mils)','netflow', '1-DSY (dly)', 'Incd Expense Ratio (mo)', 'Incd Mgmt Fee (mo)', 'Incd Other Fees (mo)', 'Incd Shr. Svc. Fee (mo)','fund_size_large','year','month','Fitch Rating', "Moody's Rating", 'S&P Rating', '12b_1','target']]
# NOTE(review): `pps` (the ppscore package) is never imported in this file.
mm_matrix=pps.matrix(ft_mm_table)
def heatmap(df):
    """Draw *df* as an annotated blue heat map (values 0-1); return the Axes."""
    axes = sns.heatmap(df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)
    axes.set_title('T-F National-PPS matrix')
    axes.set_xlabel('feature')
    axes.set_ylabel('target')
    return axes
# Render the T-F National PPS matrix, then fit a random forest on netflow.
sns.set(font_scale=1.4)
plt.subplots(figsize=(25,20))
heatmap(mm_matrix)
ft_mm_table_sub = ft_mm_table.drop(['target'],axis =1)
#create regression dataset: netflow is the response, everything else features
x = ft_mm_table_sub.drop(["netflow"],axis =1)
y = ft_mm_table_sub['netflow']
categorical = [col for col in x.columns if x[col].dtypes == 'O']
numerical = [col for col in x.columns if x[col].dtypes != 'O']
# encode categorical variables with one-hot encoding
# NOTE(review): `ce` (category_encoders) is never imported in this file.
encoder = ce.OneHotEncoder(cols=['Fitch Rating', "Moody's Rating", 'S&P Rating', '12b_1','fund_size_large'])
x = encoder.fit_transform(x)
#split dataset
np.random.seed(seed=42)
# Pure-noise column: a baseline for the permutation-importance ranking below.
x['random']=np.random.random(size=len(x))
x_train,x_valid,y_train,y_valid = train_test_split(x,y,test_size = 0.8, random_state =42)
rf = RandomForestRegressor(n_estimators = 1000,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)
rf.fit(x_train,y_train)
# BUG FIX: printed label typo '00B' (zero-zero-B) -> 'OOB' (out-of-bag).
print('R^2 training score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rf.score(x_train,y_train),
      rf.oob_score_,
      rf.score(x_valid,y_valid)))
#normalize features
from sklearn.preprocessing import RobustScaler,MinMaxScaler
scaler = StandardScaler()
scaled_x = scaler.fit_transform(x)
cols = x.columns
scaled_x = pd.DataFrame(scaled_x, columns=[cols])
#scaled_x
# Cross-validate both models on the scaled features.
# NOTE(review): score_lr is computed but never reported below.
score_lr = cross_val_score(LinearRegression(),scaled_x,y,cv=RepeatedKFold(n_repeats = 10))
score_rf = cross_val_score(RandomForestRegressor(n_estimators = 100),scaled_x,y,cv = RepeatedKFold(n_repeats = 5))
print("\nRandom Forest Mean Validation Score:",np.mean(score_rf))
Even after normalizing the features, the validation score is still low; the randomness within this category is strong.
def r2(rf, x_train, y_train):
    """Scoring helper for permutation_importances: R^2 of *rf* on the given data."""
    predictions = rf.predict(x_train)
    return r2_score(y_train, predictions)
# Permutation importance: drop in R^2 when each feature is shuffled.
perm_importance_rfpimp = permutation_importances(rf,x_train,y_train,r2)
perm_importance_rfpimp.reset_index(drop = False, inplace = True)
perm_importance_rfpimp  # display the importance table
Based on the above result, we can also observe that the random feature ranks highly in predicting the netflow, suggesting the model captures little genuine signal.
# Focus on the T-F National Inst subcategory.
subset=full_table_mm.loc[full_table_mm['SubCategory Current']=='T-F National Inst']
# Trim the top and bottom 1% of netflow to remove outliers before testing.
q_hi = subset["netflow"].quantile(0.99)
q_low = subset["netflow"].quantile(0.01)
subset=subset[(subset["netflow"]<q_hi) & (subset["netflow"]>q_low)]
H0: there is no difference between large funds and small funds in terms of netflow.
H1: netflow differs between large and small funds.
We want to know whether the average of each group is significantly different from that of the other group.
The main idea of the t-test is to check whether the observed difference is stronger than the variation in the data.
#t-test on Large fund vs. Small fund
large_fund = subset.loc[subset['fund_size_large'] == 'yes',:] #define large fund df
small_fund = subset.loc[subset['fund_size_large'] == 'no',:] #define small fund df
# Welch's t-test (unequal variances) on netflow between the two groups.
ttest_ind(large_fund['netflow'],small_fund['netflow'],equal_var = False)
ax = plt.subplots(figsize = (18,8))
#plot the small fund netflow distribution
sns.distplot(small_fund['netflow'],bins=20, hist = True, label = 'small fund')
#plot the large fund netflow distribution
sns.distplot(large_fund['netflow'],bins=20, hist = True, label = 'large fund')
plt.legend() #show legend
#boxplot - test assumption
# NOTE(review): boxplot uses first_tier_re rather than the current subset — confirm intended.
ax = sns.boxplot(y="fund_size_large",x="netflow",data=first_tier_re,orient="h")
# Same outlier-trimmed subset, now for T-F National Retail.
subset=full_table_mm.loc[full_table_mm['SubCategory Current']=='T-F National Retail']
q_hi = subset["netflow"].quantile(0.99)
q_low = subset["netflow"].quantile(0.01)
subset=subset[(subset["netflow"]<q_hi) & (subset["netflow"]>q_low)]
H0: there is no difference between large funds and small funds in terms of netflow.
H1: netflow differs between large and small funds.
We want to know whether the average of each group is significantly different from that of the other group.
The main idea of the t-test is to check whether the observed difference is stronger than the variation in the data.
#t-test on Large fund vs. Small fund
large_fund = subset.loc[subset['fund_size_large'] == 'yes',:] #define large fund df
small_fund = subset.loc[subset['fund_size_large'] == 'no',:] #define small fund df
# Welch's t-test (unequal variances) on netflow between the two groups.
ttest_ind(large_fund['netflow'],small_fund['netflow'],equal_var = False)
# Restrict to the Treasury Instit / Retail subcategories (index lookup).
ft_mm_table=full_table_mm.loc[['Treasury Instit','Treasury Retail']]
# Keep yield, fee, size, calendar, rating columns plus the 'target' label.
ft_mm_table=ft_mm_table[['1-Yr (NCAGR)','30-DSY (wk/mo)', 'Portfolio Assets (mo/$mils)','netflow', '1-DSY (dly)', 'Incd Expense Ratio (mo)', 'Incd Mgmt Fee (mo)', 'Incd Other Fees (mo)', 'Incd Shr. Svc. Fee (mo)','fund_size_large','year','month','Fitch Rating', "Moody's Rating", 'S&P Rating', '12b_1','target']]
mm_matrix=pps.matrix(ft_mm_table)  # predictive-power-score matrix of the table
def heatmap(df, title='Treasury-PPS matrix'):
    """Plot a predictive-power-score (PPS) matrix as an annotated heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix of PPS values (as produced by ``pps.matrix``).
    title : str, optional
        Chart title. Defaults to the original hard-coded label so existing
        calls keep their behavior; parameterizing it removes the need to
        re-define the whole function for every fund category.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the heatmap was drawn on.
    """
    # PPS values are bounded in [0, 1], so pin the colour scale accordingly.
    ax = sns.heatmap(df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)
    ax.set_title(title)
    ax.set_xlabel('feature')
    ax.set_ylabel('target')
    return ax
sns.set(font_scale=1.4)  # enlarge annotation text for the large figure
plt.subplots(figsize=(25,20))
heatmap(mm_matrix)  # draw the PPS matrix using the helper defined above
# Drop the label column; the rest is split into features and regression target.
ft_mm_table_sub = ft_mm_table.drop(['target'],axis =1)
#creat LinearRegression dataset
x = ft_mm_table_sub.drop(["netflow"],axis =1)  # feature matrix
y = ft_mm_table_sub['netflow']  # regression target
# Partition columns by dtype ('O' == pandas object, i.e. categorical/text).
categorical = [col for col in x.columns if x[col].dtypes == 'O']
numerical = [col for col in x.columns if x[col].dtypes != 'O']
# encode categorical variables with one-hot encoding
encoder = ce.OneHotEncoder(cols=['Fitch Rating', "Moody's Rating", 'S&P Rating', '12b_1','fund_size_large'
])
x = encoder.fit_transform(x)
#split dataset
np.random.seed(seed=42)
# Add a pure-noise column as an importance baseline for the random forest.
x['random']=np.random.random(size=len(x))
# NOTE(review): test_size=0.8 leaves only 20% of rows for training — confirm intended.
x_train,x_valid,y_train,y_valid = train_test_split(x,y,test_size = 0.8, random_state =42)
rf = RandomForestRegressor(n_estimators = 1000,
n_jobs = -1,
oob_score = True,
bootstrap = True,
random_state = 42)
rf.fit(x_train,y_train)
# Report training fit, out-of-bag estimate, and held-out validation R^2.
# Fixed label: "OOB" (out-of-bag) was previously mistyped as "00B" (zero-zero-B).
print('R^2 training score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rf.score(x_train,y_train),
                                                                                             rf.oob_score_,
                                                                                             rf.score(x_valid,y_valid)))
#normalize target
# StandardScaler is used below but was missing from this import line (the code
# only ran if it happened to be imported earlier in the notebook) — import it
# explicitly so this cell is self-sufficient.
from sklearn.preprocessing import RobustScaler,MinMaxScaler,StandardScaler
scaler = StandardScaler()
scaled_x = scaler.fit_transform(x)
cols = x.columns
# columns=cols (not [cols]): wrapping the Index in a list built MultiIndex columns.
scaled_x = pd.DataFrame(scaled_x, columns=cols)
#scaled_x
# Repeated k-fold cross-validation on the standardized features.
score_lr = cross_val_score(LinearRegression(),scaled_x,y,cv=RepeatedKFold(n_repeats = 10))
score_rf = cross_val_score(RandomForestRegressor(n_estimators = 100),scaled_x,y,cv = RepeatedKFold(n_repeats = 5))
# Report both models; score_lr was previously computed but never shown.
print("\nLinear Regression Mean Validation Score:",np.mean(score_lr))
print("\nRandom Forest Mean Validation Score:",np.mean(score_rf))
def r2(rf, x_train, y_train):
    """Scoring helper for permutation_importances: R^2 of *rf* on the given data."""
    predictions = rf.predict(x_train)
    return r2_score(y_train, predictions)
# Permutation importance: drop in R^2 when each feature is shuffled.
perm_importance_rfpimp = permutation_importances(rf,x_train,y_train,r2)
perm_importance_rfpimp.reset_index(drop = False, inplace = True)
perm_importance_rfpimp  # display the importance table
# Outlier-trimmed subset for Treasury Instit.
subset=full_table_mm.loc[full_table_mm['SubCategory Current']=='Treasury Instit']
q_hi = subset["netflow"].quantile(0.99)
q_low = subset["netflow"].quantile(0.01)
subset=subset[(subset["netflow"]<q_hi) & (subset["netflow"]>q_low)]
H0: there is no difference between large funds and small funds in terms of netflow.
H1: netflow differs between large and small funds.
We want to know whether the average of each group is significantly different from that of the other group.
The main idea of the t-test is to check whether the observed difference is stronger than the variation in the data.
#t-test on Large fund vs. Small fund
large_fund = subset.loc[subset['fund_size_large'] == 'yes',:] #define large fund df
small_fund = subset.loc[subset['fund_size_large'] == 'no',:] #define small fund df
# Welch's t-test (unequal variances) on netflow between the two groups.
ttest_ind(large_fund['netflow'],small_fund['netflow'],equal_var = False)
# Outlier-trimmed subset for Treasury Retail.
subset=full_table_mm.loc[full_table_mm['SubCategory Current']=='Treasury Retail']
q_hi = subset["netflow"].quantile(0.99)
q_low = subset["netflow"].quantile(0.01)
subset=subset[(subset["netflow"]<q_hi) & (subset["netflow"]>q_low)]
H0: there is no difference between large funds and small funds in terms of netflow.
H1: netflow differs between large and small funds.
We want to know whether the average of each group is significantly different from that of the other group.
The main idea of the t-test is to check whether the observed difference is stronger than the variation in the data.
#t-test on Large fund vs. Small fund
large_fund = subset.loc[subset['fund_size_large'] == 'yes',:] #define large fund df
small_fund = subset.loc[subset['fund_size_large'] == 'no',:] #define small fund df
# Welch's t-test (unequal variances) on netflow between the two groups.
ttest_ind(large_fund['netflow'],small_fund['netflow'],equal_var = False)
# Derive calendar features from the DATE column.
full_table_mm['year'] = pd.DatetimeIndex(full_table_mm['DATE']).year
full_table_mm['month'] = pd.DatetimeIndex(full_table_mm['DATE']).month
ft_mm_table = full_table_mm  # NOTE(review): immediately overwritten below — dead assignment
#ft_mm_table = ft_mm_table.set_index('SubCategory Current',inplace = True)
# Restrict to the T-F State subcategories.
ft_mm_table=full_table_mm.loc[['T-F State Inst','T-F State Retail']]
# NOTE(review): unlike the National/Treasury sections, this list has no '12b_1' column.
ft_mm_table=ft_mm_table[['1-Yr (NCAGR)','30-DSY (wk/mo)', 'Portfolio Assets (mo/$mils)','netflow', '1-DSY (dly)', 'Incd Expense Ratio (mo)', 'Incd Mgmt Fee (mo)', 'Incd Other Fees (mo)', 'Incd Shr. Svc. Fee (mo)','fund_size_large','year','month','Fitch Rating', "Moody's Rating", 'S&P Rating','target']]
mm_matrix=pps.matrix(ft_mm_table)  # predictive-power-score matrix of the table
def heatmap(df, title='T-F State matrix'):
    """Plot a predictive-power-score (PPS) matrix as an annotated heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix of PPS values (as produced by ``pps.matrix``).
    title : str, optional
        Chart title. Defaults to the original hard-coded label so existing
        calls keep their behavior; parameterizing it removes the need to
        re-define the whole function for every fund category.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the heatmap was drawn on.
    """
    # PPS values are bounded in [0, 1], so pin the colour scale accordingly.
    ax = sns.heatmap(df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)
    ax.set_title(title)
    ax.set_xlabel('feature')
    ax.set_ylabel('target')
    return ax
sns.set(font_scale=1.4)  # enlarge annotation text for the large figure
plt.subplots(figsize=(25,20))
heatmap(mm_matrix)  # draw the PPS matrix using the helper defined above
Note: all funds in the T-F State category lack rating attributes.
#Drop rating attributes for T-F State category
ft_mm_table_sub = ft_mm_table.drop(['target','Fitch Rating', "Moody's Rating", 'S&P Rating'],axis =1)
#create LinearRegression dataset
x = ft_mm_table_sub.drop(["netflow"],axis =1)  # feature matrix
y = ft_mm_table_sub['netflow']  # regression target
# Partition columns by dtype ('O' == pandas object, i.e. categorical/text).
categorical = [col for col in x.columns if x[col].dtypes == 'O']
numerical = [col for col in x.columns if x[col].dtypes != 'O']
# encode categorical variables with one-hot encoding
# BUG FIX: the T-F State feature list does not include a '12b_1' column, so
# asking the encoder to encode it failed on the missing column; encode only
# the categorical column actually present in this subset.
encoder = ce.OneHotEncoder(cols=['fund_size_large'])
x = encoder.fit_transform(x)
#split dataset and add a random attribute
np.random.seed(seed=42)
# Pure-noise column as an importance baseline for the random forest.
x['random']=np.random.random(size=len(x))
x_train,x_valid,y_train,y_valid = train_test_split(x,y,test_size = 0.8, random_state =42)
# Random forest with out-of-bag scoring enabled for an internal validation estimate.
rf = RandomForestRegressor(n_estimators = 1000,
n_jobs = -1,
oob_score = True,
bootstrap = True,
random_state = 42)
rf.fit(x_train,y_train)
# Report training fit, out-of-bag estimate, and held-out validation R^2.
# Fixed label: "OOB" (out-of-bag) was previously mistyped as "00B" (zero-zero-B).
print('R^2 training score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rf.score(x_train,y_train),
                                                                                             rf.oob_score_,
                                                                                             rf.score(x_valid,y_valid)))
#normalize target
# StandardScaler is used below but was missing from this import line (the code
# only ran if it happened to be imported earlier in the notebook) — import it
# explicitly so this cell is self-sufficient.
from sklearn.preprocessing import RobustScaler,MinMaxScaler,StandardScaler
scaler = StandardScaler()
scaled_x = scaler.fit_transform(x)
cols = x.columns
# columns=cols (not [cols]): wrapping the Index in a list built MultiIndex columns.
scaled_x = pd.DataFrame(scaled_x, columns=cols)
#scaled_x
# Repeated k-fold cross-validation on the standardized features.
score_lr = cross_val_score(LinearRegression(),scaled_x,y,cv=RepeatedKFold(n_repeats = 10))
score_rf = cross_val_score(RandomForestRegressor(n_estimators = 1000),scaled_x,y,cv = RepeatedKFold(n_repeats = 4))
# Report both models; score_lr was previously computed but never shown.
print("\nLinear Regression Mean Validation Score:",np.mean(score_lr))
print("\nRandom Forest Mean Validation Score:",np.mean(score_rf))
def r2(rf, x_train, y_train):
    """Scoring helper for permutation_importances: R^2 of *rf* on the given data."""
    predictions = rf.predict(x_train)
    return r2_score(y_train, predictions)
# Permutation importance: drop in R^2 when each feature is shuffled.
perm_importance_rfpimp = permutation_importances(rf,x_train,y_train,r2)
perm_importance_rfpimp.reset_index(drop = False, inplace = True)
perm_importance_rfpimp  # display the importance table
# Outlier-trimmed subset for T-F State Retail.
subset=full_table_mm.loc[full_table_mm['SubCategory Current']=='T-F State Retail']
q_hi = subset["netflow"].quantile(0.99)
q_low = subset["netflow"].quantile(0.01)
subset=subset[(subset["netflow"]<q_hi) & (subset["netflow"]>q_low)]
H0: there is no difference between large funds and small funds in terms of netflow.
H1: netflow differs between large and small funds.
We want to know whether the average of each group is significantly different from that of the other group.
The main idea of the t-test is to check whether the observed difference is stronger than the variation in the data.
#t-test on Large fund vs. Small fund
large_fund = subset.loc[subset['fund_size_large'] == 'yes',:] #define large fund df
small_fund = subset.loc[subset['fund_size_large'] == 'no',:] #define small fund df
# Welch's t-test (unequal variances) on netflow between the two groups.
ttest_ind(large_fund['netflow'],small_fund['netflow'],equal_var = False)
import seaborn as sns
sns.set(style='white')
# Client money-market balances from S3.
file_name_4 ="MM_citi.csv"
obj_1 = s3.get_object(Bucket=bucket, Key = file_name_4)
lm_ts = pd.read_csv(obj_1['Body'])
# Historical oil price series from S3.
file_name_5 ="HistoricalData.csv"
obj_1 = s3.get_object(Bucket=bucket, Key = file_name_5)
oil_p = pd.read_csv(obj_1['Body'])
# Drop identifier/bookkeeping columns not needed for the correlation study.
lm_ts = lm_ts.drop(['File Paths','Table Names','Social Code','Cusip Number', 'Dealer','Company','Money Market Indicator'],axis=1)
oil_p.dtypes  # inspect dtypes before conversion
#change data type
oil_p['Date'] = pd.to_datetime(oil_p['Date'])
lm_ts['Balance Date'] = pd.to_datetime(lm_ts['Balance Date'])
#merge oil price with lm client
# Right join keeps every client-balance row, matching oil prices by date.
merged_lm = pd.merge(oil_p, lm_ts, left_on='Date', right_on = 'Balance Date', how = 'right')
#Total Dollar Balance is skewed, perform log transformation
merged_lm['log_tdb'] = np.log(merged_lm['Total Dollar Balance'])
merged_lm.head()
merged_lm = merged_lm.dropna(subset=['Date'])
sns.pairplot(merged_lm,size=2.5)
# BUG FIX: plt.show was referenced without parentheses, so the call never happened.
plt.show()
Even after the log transformation, the correlation between oil price and total asset balance remains insignificant.
# Correlation btw oil price and total asset balance group by clients
# Pairwise correlation of the oil 'Low' price and log balance, per client block.
merged_lm.groupby('Name Only Block')[['Low','log_tdb']].corr()
from pyramid.arima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_pacf,plot_acf
from statsmodels.tsa.seasonal import seasonal_decompose
# Pull a single fund by CUSIP from both data sources.
WA_fund1=edh_mm.loc[edh_mm['cusip']=='52470G791']
WA_fund2=df_data_1.loc[df_data_1['CUSIP']=='52470G791']
WA_fund2=WA_fund2[['Date','Portfolio Assets (mo/$mils)']]  # date + AUM series
WA_fund1=WA_fund1[['as_of_dt','netflow']]  # date + netflow series
#WA_fund2
#WA_fund2.set_index('Date', inplace=True)
WA_fund2.Date = pd.to_datetime(WA_fund2.Date)
WA_fund2.info()
# Classical decomposition with a 12-period (annual) seasonal cycle.
result = seasonal_decompose(WA_fund2,freq=12)
sns.set(rc={'figure.figsize':(20,5)})
result.plot()
By observing the seasonal trend, we can see that there is clearly a seasonal component in the data — multiplicative.
plot_acf(WA_fund2)  # autocorrelation of the AUM series
# Grid-search a seasonal ARIMA (12-month cycle) by AIC; summary prints the best order.
auto_arima(WA_fund2['Portfolio Assets (mo/$mils)'],seasonal=True,m=12,suppress_warnings=True,information_criterion='aic',max_P=5,max_D=5,max_Q=5,max_p=5,max_d=5,max_q=5).summary()
# NOTE(review): train/test are created but never used below — Prophet fits the full series.
train = WA_fund2[:100]
test = WA_fund2[100:]
# Prophet requires the value column named 'y' and the date column named 'ds'.
WA_fund2['y']=WA_fund2['Portfolio Assets (mo/$mils)']
WA_fund2['ds']=WA_fund2['Date']
from fbprophet import Prophet
m=Prophet(seasonality_mode='multiplicative')
m.fit(WA_fund2)
# Forecast 24 months beyond the observed history.
future=m.make_future_dataframe(periods=24,freq='M')
forecast=m.predict(future)
forecast
figure=m.plot(forecast,xlabel='Date',ylabel='Portfolio Asset')
figure=m.plot_components(forecast)
#fed_rate
#WA_fund2
fed_rate_ts = fed_rate
#fed_rate_ts.set_index('DATE')
# Join the effective fed funds rate onto the fund series by calendar date.
WA_fund2_regressor=pd.merge(fed_rate_ts,WA_fund2, left_on ='DATE', right_on = 'Date')
def effr_envior(EFFR):
    """Binary rate-regime flag: 1 when EFFR exceeds 0.25, otherwise 0."""
    return 1 if EFFR > 0.25 else 0
WA_fund2_regressor.dtypes  # inspect merged frame before fitting
# Impute missing EFFR with the median so the regressor column has no NaNs.
WA_fund2_regressor['EFFR'].fillna(WA_fund2_regressor['EFFR'].median(), inplace = True)
m=Prophet()
# Add EFFR as an extra regressor (unstandardized) for the Prophet model.
m.add_regressor('EFFR',standardize=False)
m.fit(WA_fund2_regressor)
# NOTE(review): this subsetting happens after fit — confirm the ordering is intended.
WA_fund2_regressor=WA_fund2_regressor[['y','ds','EFFR']]
#future=m.make_future_dataframe(periods=24,freq='M')
# Align EFFR onto the future frame by position; forecast periods beyond history get NaN.
future['EFFR']=WA_fund2_regressor['EFFR']
future.reset_index(drop=True)  # NOTE(review): result not assigned — this line is a no-op
future['EFFR'].fillna(future['EFFR'].median(), inplace = True)
forecast=m.predict(future)
forecast
figure=m.plot(forecast,xlabel='Date',ylabel='Portfolio Asset')
figure=m.plot_components(forecast)